/**
 * Copyright (C) 2010 Neofonie GmbH
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.dicodeproject.analysis.lucene;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;

/**
 * Analyzer built on the Lucene standard analysis chain (standard tokenizer, standard filter, lower-casing and
 * English stop word removal) that additionally drops tokens shorter than a minimum length (default: two
 * characters) and tokens that consist of digits only.
 */
public class CleansingAnalyzer extends Analyzer {
  /** Min number of characters for non-skipped words.*/
  private final int lowerBound;
  /** Track position increments due to skipped tokens.*/
  private final boolean enablePositionIncrement;

  /** Default init: lower bound of 2 characters, position increment tracking disabled.*/
  public CleansingAnalyzer() {
    this(2, false);
  }

  /**
   * @param lowerBound minimum number of characters a token must have in order to be kept
   * @param enablePositionIncrement whether the position increment of a kept token should account for the tokens
   *                                that were skipped in front of it
   */
  public CleansingAnalyzer(int lowerBound, boolean enablePositionIncrement) {
    this.lowerBound = lowerBound;
    this.enablePositionIncrement = enablePositionIncrement;
  }

  /**
   * Delegate most of the analysis to the Lucene standard components, add filtering of tokens shorter than the
   * minimum length and filtering of tokens that are digit only.
   *
   * @param fieldName name of the field being analyzed (not used by this analyzer)
   * @param reader source of the text to analyze
   * @return the filtered token stream
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new StandardTokenizer(Version.LUCENE_30, reader);
    stream = new StandardFilter(stream);
    stream = new LowerCaseFilter(stream);
    // Hand the configured flag to the stop filter as well, so that removed stop words are tracked the same way
    // as tokens removed by the cleansing filter (the flag used to be hard-coded to false here).
    stream = new StopFilter(this.enablePositionIncrement, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    return new CleansingFilter(stream, this.lowerBound, this.enablePositionIncrement);
  }

  /**
   * TokenFilter that throws out tokens shorter than the minimum length as well as digit only terms.
   */
  private static class CleansingFilter extends TokenFilter {
    /**
     * RegEx pattern for digit only tokens. Compiled once for all filter instances. Note that it also matches the
     * empty string, so empty tokens are treated as digit only.
     */
    private static final Pattern DIGIT_PATTERN = Pattern.compile("[0-9]*");

    /** Min number of characters a token must have to not be skipped.*/
    final int lowerBound;

    /** Whether position increment should be enabled.*/
    final boolean enablePositionIncrement;

    final TermAttribute termAtt = addAttribute(TermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    /**
     * Construct a token stream filtering the given input.
     *
     * @param input the token stream to filter
     * @param lowerBound minimum number of characters a token must have in order to be kept
     * @param enablePositionIncrement whether skipped positions are added to the next kept token
     */
    protected CleansingFilter(TokenStream input, int lowerBound, boolean enablePositionIncrement) {
      super(input);
      this.lowerBound = lowerBound;
      this.enablePositionIncrement = enablePositionIncrement;
    }

    /**
     * Construct a token stream filtering the given input with a lower bound of 2 and position increments disabled.
     */
    protected CleansingFilter(TokenStream input) {
      this(input, 2, false);
    }

    /**
     * Skip all garbage (too short, digits only) tokens. Implementation heavily inspired by Lucene's StopFilter.
     *
     * @return true if a non-garbage token is available, false once the input is exhausted
     * @throws IOException if the wrapped token stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
      int skippedPositions = 0;
      while (input.incrementToken()) {
        final int length = termAtt.termLength();
        // Check the length first: it does not require materializing the term as a String.
        if (!isTooShort(length) && !isDigitOnly(new String(termAtt.termBuffer(), 0, length))) {
          if (this.enablePositionIncrement) {
            // Account for the positions of all tokens dropped since the last emitted token.
            posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
          }
          return true;
        }
        skippedPositions += posIncrAtt.getPositionIncrement();
      }
      // Reached the end of the input stream.
      return false;
    }

    /** Returns true if a token of the given length is shorter than lowerBound.*/
    private boolean isTooShort(int tokenLength) {
      // Strictly shorter: a token of exactly lowerBound characters is kept.
      return tokenLength < this.lowerBound;
    }

    /** Returns true if the token is digit only.*/
    private boolean isDigitOnly(String token) {
      return DIGIT_PATTERN.matcher(token).matches();
    }
  }

}
